import json
from string import Template
import types
from request import Request
from spider import Spider


class BaseProcess(object):
    """Page processor whose behaviour is driven by a JSON config file.

    The config is loaded from ``./spidercfg/<config_file>``. The keys read by
    this class are:

    * ``range`` -- ``"<start>-<end>"`` bounds used to generate start URLs.
    * ``url`` -- a ``string.Template`` with a ``range`` placeholder.
    * ``processorFn`` -- mapping of processing-step name to its settings
      (``item``, ``returnType``, ``nextProcess``, ``parserType``, ``xpath``
      and, where used, ``args`` and ``successCheck``).
    * ``pipelineItems`` -- names of the extra pipelines to enable.
    * ``threadOption`` -- threading options handed to the spider.
    """

    start_url = []
    useProxy = False
    headers = None

    def __generate_url(self):
        """Yield one start URL per integer in the configured range.

        The upper bound is exclusive, as with ``range()``.
        """
        bounds = self.__config["range"].split("-")
        range_start = int(bounds[0])
        range_end = int(bounds[1])
        primary_url = Template(self.__config["url"])
        for i in range(range_start, range_end):
            yield primary_url.safe_substitute(range=i)

    def __init__(self, config_file):
        """Load ``./spidercfg/<config_file>`` and prepare the start URLs.

        Args:
            config_file: File name of the JSON config inside ``./spidercfg/``.
        """
        with open("./spidercfg/" + config_file) as cfg:
            self.__config = json.load(cfg)
        # Exposes the builtin ``print`` as an instance attribute bound to the
        # class, so ``self.print(x)`` prints the class object followed by x.
        self.__dict__["print"] = types.MethodType(print, BaseProcess)
        # A generator: it can be consumed only once.
        self.start_url = self.__generate_url()
        self.useProxy = False

    def next_processor(self, *args):
        """Parse a page and yield scraped items or follow-up requests.

        Args:
            *args: ``(caller, html[, funcName])``. ``args[1]`` is the page
                text; ``args[2]``, when present, names the ``processorFn``
                entry to use. Without it the ``"nextProcess"`` entry is used.

        Yields:
            A ``Request`` per parsed element when the step's ``returnType``
            is ``"DownerLoader"``; otherwise a new dict of extracted fields
            per parsed element.
        """
        funcName = "nextProcess" if len(args) < 3 else args[2]
        fn_cfg = self.__config["processorFn"][funcName]
        cfg_items = fn_cfg["item"]
        returnType = fn_cfg["returnType"]
        nextMethodName = fn_cfg["nextProcess"]
        # If a follow-up processing step is configured, create its callback
        # dynamically so it can be looked up by name below.
        if nextMethodName != "None":
            self.__initDynamicMethod(nextMethodName)
        html = args[1]
        elements, parserType, json_result = self.element_parser(funcName, html)
        print("count:", len(elements))
        if parserType == "json":
            # Imported lazily so jsonpath_rw is only needed for JSON configs.
            from jsonpath_rw import parse
        for element in elements:
            # A fresh dict per element: yielding one shared dict would make
            # every collected result alias the values of the last element.
            items = {}
            for key, value in cfg_items.items():
                if parserType == "lxml":
                    # Field is extracted with an xpath relative to the element.
                    items[key] = element.xpath(value)[0]
                elif parserType == "regex":
                    # Field is a capture-group index into the regex match.
                    items[key] = element[int(value)]
                elif parserType == "json":
                    # NOTE(review): the path is resolved against the whole
                    # document, not the current element, so every element
                    # yields the same values -- confirm this is intended.
                    items[key] = parse(value).find(json_result)[0].value

            # "DownerLoader" means: build a Request to keep crawling;
            # anything else means the extracted fields are the final item.
            if returnType == "DownerLoader":
                url = Template(fn_cfg["args"]["url"]).safe_substitute(items)
                yield Request(url=url, callback=self.__dict__[nextMethodName])
            else:
                yield items

    def element_parser(self, funcName, html):
        """Split ``html`` into elements using the step's configured parser.

        Args:
            funcName: Name of the ``processorFn`` entry to read settings from.
            html: Raw page text (HTML, or a JSON document for ``"json"``).

        Returns:
            A tuple ``(elements, parserType, json_result)``. ``json_result``
            is the decoded document for the ``"json"`` parser, else ``None``.

        Raises:
            ValueError: If ``parserType`` is not ``lxml``, ``regex`` or
                ``json``.
        """
        fn_cfg = self.__config["processorFn"][funcName]
        parserType = fn_cfg["parserType"]
        json_result = None
        if parserType == "lxml":
            from lxml import etree
            elements = etree.HTML(html).xpath(fn_cfg["xpath"])
        elif parserType == "regex":
            import re
            # The pattern is stored under the "xpath" key for every parser.
            elements = re.compile(fn_cfg["xpath"], re.S).findall(html)
        elif parserType == "json":
            from jsonpath_rw import parse
            json_result = json.loads(html)
            xpath = fn_cfg["xpath"]
            # Optional: a missing "successCheck" means "always successful".
            successCheck = fn_cfg.get("successCheck")
            isSuccess = True
            elements = []
            if successCheck is not None:
                express = successCheck["express"]
                checkValue = successCheck["checkValue"]
                isSuccess = parse(express).find(json_result)[0].value == checkValue
                print("-----------isuccess", isSuccess)
            if isSuccess:
                # NOTE(review): each match overwrites ``elements``, so only
                # the last jsonpath match is kept -- confirm this is intended.
                for match in parse(xpath).find(json_result):
                    matched = match.value
                    elements = matched if isinstance(matched, list) else [matched]
        else:
            raise ValueError("unsupported parserType: %r" % (parserType,))
        return elements, parserType, json_result

    def __initDynamicMethod(self, methodName):
        """Store ``next_processor`` on the instance under ``methodName``.

        The stored callable passes the instance as an extra first argument,
        which is why ``next_processor`` reads the page text from ``args[1]``.
        """
        self.__dict__[methodName] = types.MethodType(self.next_processor, self)

    def process(self, html):
        """Run the ``"mainProcess"`` step on ``html`` and return its generator."""
        funcName = "mainProcess"
        return self.next_processor(self, html, funcName)

    def get_pipeline(self):
        """Return the pipeline instances to attach to the spider.

        A ``ConsolePipeline`` is always first; the recognised names listed in
        the config's ``pipelineItems`` follow in order. Unknown names are
        ignored.
        """
        pipe_items = []
        pipelineitems = self.__config["pipelineItems"]
        from console_pipeline import ConsolePipeline
        pipe_items.append(ConsolePipeline())
        for pipeline in pipelineitems:
            if "MongoDBPipeline" == pipeline:
                from mongo_pipeline import MongoDBPipeline
                pipe_items.append(MongoDBPipeline("video", "bilibili"))
            elif "FileDownloadPipeline" == pipeline:
                from file_download_pipeline import FileDownloadPipeline
                pipe_items.append(FileDownloadPipeline())
            elif "FileWriterPipeline" == pipeline:
                from file_writer_pipeline import FileWriterPipeline
                pipe_items.append(FileWriterPipeline())
        return pipe_items

    def get_threadOption(self):
        """Return the threading options, or ``None`` when threading is off.

        Returns:
            ``None`` if the config has no ``threadOption`` (or it is null) or
            its ``multiplethread`` is the string ``"False"``. Otherwise a copy
            of the options in which ``multiplethread`` is a real boolean.
        """
        thread_option = self.__config.get("threadOption")
        if thread_option is None or thread_option["multiplethread"] == "False":
            return None
        option = {}
        for key, value in thread_option.items():
            if key == "multiplethread":
                # Accept a JSON boolean as-is; otherwise only the string
                # "True" enables threading.
                option[key] = value if isinstance(value, bool) else value == "True"
            else:
                option[key] = value
        return option


if __name__ == "__main__":
    # Build the config-driven processor and hand it to the spider together
    # with whatever threading options the config declares.
    processor = BaseProcess("bilibili.json")
    spider = Spider(processor, threadoptions=processor.get_threadOption())
    # Attach every configured pipeline, echoing each one as it is registered.
    for pipe in processor.get_pipeline():
        print(pipe)
        spider.addPipeline(pipe)
    spider.start()
